In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py 
import plotly.express as px
from plotly import graph_objects as go
from pylab import rcParams

import sklearn
from sklearn.cluster import KMeans
In [2]:
plt.style.use('fivethirtyeight')
%matplotlib inline
# rcParams['figure.figsize'] = 15,6

Data Collection and Analysis¶

In [3]:
df = pd.read_csv('./Data/Mall_Customers.csv')
df.head()
Out[3]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB
In [5]:
# Some visualization of features

plt.figure(figsize = (12, 5))

n = 1

for i in ['Age',	'Annual Income (k$)',	'Spending Score (1-100)']:
    plt.subplot(1, 3, n)
    # plt.subplots_adjust(hspace =0.5 , wspace = 0.5)
    sns.histplot(df[i], bins = 20)
    plt.title('Distribution of {}'.format(i))
    n += 1
    
plt.show()
In [6]:
plt.figure(figsize = (6,4))

sns.countplot(x = 'Gender', data = df, palette= 'hls')
Out[6]:
<AxesSubplot:xlabel='Gender', ylabel='count'>
In [7]:
sns.pairplot(df[['Age',	'Annual Income (k$)',	'Spending Score (1-100)']])

# we can see that the 'Annual Income (k$)',	'Spending Score (1-100)' 
# features are showing the clustering behavior. Age and Spending Score (1-100) are
# also showing some clustering behavior. 

#### We will do the clustering with two subsets of the features. 
#### i)  Annual Income (k$) & Spending Score (1-100)
#### ii) Annual Income (k$), Spending Score (1-100) & Age
Out[7]:
<seaborn.axisgrid.PairGrid at 0x15dbab910>

i) Annual Income (k$) & Spending Score (1-100)¶

In [8]:
# Choosing features for clustering

X1 = df[['Annual Income (k$)', 'Spending Score (1-100)']].values

Choosing the number of clusters¶

WCSS¶

In [9]:
wcss = []

for i in range(1, 11):
    model = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    model.fit(X1)
    wcss.append(model.inertia_)
In [10]:
# Plotting the elbow graph
plt.figure(figsize = (7, 5))
plt.plot(wcss, marker = 'o', alpha = 0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('The Elbow Point Graph')
plt.show()
In [11]:
# We can see that cluster number should be 4 or 5. To be more accurate we should 
# evaluate the model score using Silhouette score for different clustering size.
In [12]:
from sklearn.metrics import silhouette_score

for i in range(3, 7):
    model = KMeans(n_clusters = i, init = 'k-means++', random_state = 42) 
    model.fit(X1)
    score = silhouette_score(X1, model.labels_)
    print("Silhouette score for K = ", i, ": {:.3f}".format(score))


# We can see that, the silhouette score is heighst for K = 5
Silhouette score for K =  3 : 0.468
Silhouette score for K =  4 : 0.493
Silhouette score for K =  5 : 0.554
Silhouette score for K =  6 : 0.540
In [13]:
model = KMeans(n_clusters= 5, init = 'k-means++')
model.fit(X1)

# print(model.get_params())
Out[13]:
KMeans(n_clusters=5)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=5)
In [14]:
plt.figure(figsize = (7, 6))

# Create scatter plot with different colors for each cluster
fig = px.scatter(x=X1[:,0], y=X1[:,1], color = model.labels_, width = 800, height = 400)
fig.update_layout(xaxis_title = 'Annual Income (K$)', yaxis_title = 'Spending Score (1 - 100)', 
                  title = 'KMeans Clustering Results', title_x=0.5, title_y=0.95, coloraxis_showscale=False)

# Add cluster centroids as black X
centroids = model.cluster_centers_
fig.add_scatter(x=centroids[:, 0], y=centroids[:, 1],
                mode='markers', marker=dict(size=10, color='black', symbol='x'))

# Show plot
fig.show()
<Figure size 504x432 with 0 Axes>

ii) Annual Income (k$), Spending Score (1-100) & Age¶

In [15]:
X2 = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].values
In [16]:
wcss2 = []

for i in range(1, 11):
    model = KMeans(n_clusters= i, init = 'k-means++', random_state= 42)
    model.fit(X2)
    wcss2.append(model.inertia_)
In [17]:
plt.figure(figsize=(7,5))
plt.plot(wcss2, marker = 'o', alpha = 0.5)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('The Elbow Point Graph ')
plt.show()

# We can see that cluster number could be 4, 5 or 6. To be more accurate we should 
# evaluate the model score using Silhouette score for different clustering size.
In [18]:
for i in range(4, 8):
    model = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    model.fit(X2)
    score = silhouette_score(X2, model.labels_)
    print("silhouette_score for K = ", i, ": {:.3f}".format(score))

# We can see that, the silhouette score is heighst for K = 6, 
# but it's not much higher than K = 5. So, we will use K = 5.
silhouette_score for K =  4 : 0.405
silhouette_score for K =  5 : 0.444
silhouette_score for K =  6 : 0.452
silhouette_score for K =  7 : 0.440
In [19]:
model = KMeans(n_clusters = 5, init = 'k-means++', random_state = 42)
model.fit(X2)

# print(model.get_params())
Out[19]:
KMeans(n_clusters=5, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=5, random_state=42)
In [20]:
# label2 = model.labels_
# centroids2 = model.cluster_centers_

# trace1 = go.Scatter3d(x= df['Age'],
#                       y= df['Spending Score (1-100)'],
#                       z= df['Annual Income (k$)'],
#                       mode='markers', marker=dict(color = label2, size= 20, 
#                       line=dict(color= label2, width= 12),opacity=0.8))

# trace2 = go.Scatter3d(x=centroids2[:,0],
#                       y=centroids2[:,1],
#                       z=centroids2[:,2],
#                       mode='markers', marker=dict(color='black', size = 10, symbol = 'circle'))

# data = [trace1, trace2]

# layout = go.Layout(title= 'Clusters', title_x=0.5, title_y=0.95,
#     scene = dict(
#             xaxis = dict(title  = 'Age'),
#             yaxis = dict(title  = 'Spending Score'),
#             zaxis = dict(title  = 'Annual Income')),
#     margin = dict(l = 10, r = 10, b = 10, t = 10))

# fig = go.Figure(data=data, layout=layout)

# py.offline.iplot(fig)
In [22]:
import plotly.express as px

label2 = model.labels_
centroids2 = model.cluster_centers_

df['labels'] = label2

fig = px.scatter_3d(df, x='Age', y='Spending Score (1-100)', 
                    z='Annual Income (k$)', color='labels', 
                    opacity = 0.6, width = 1000, height = 600)

fig.update_traces(marker_size = 15)

fig.add_scatter3d(x=centroids2[:,0], y=centroids2[:,1], z=centroids2[:,2], 
                  mode='markers', marker=dict(size=10, color='black', symbol='circle'))

fig.update_layout(title = 'KMeans Clustering Results', title_x=0.5, 
                  title_y=0.95, coloraxis_showscale=False)

fig.show()
In [ ]: